register '$PIGGYBANK_PATH';

DEFINE DATE_EXTRACT_MM 
org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM');

DEFINE DATE_EXTRACT_DD 
org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM-dd');

-- pig/input/test-pig-full.txt
raw_logs = load '$INPUT' USING org.apache.pig.piggybank.storage.MyRegExLoader('^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] "(\\S+) (\\S+) (HTTP[^"]+)" (\\S+) (\\S+) "([^"]*)" "([^"]*)" "(\\S+)" "(\\S+)" (\\S+) "(.*)" (\\S+) (\\S+)')
as (remoteAddr: chararray, 
n2: chararray, 
n3: chararray, 
time: chararray, 
method: chararray,
path:chararray,
protocol:chararray,
status: int, 
bytes_string: chararray, 
referrer: chararray, 
browser: chararray, 
n10:chararray,
remoteLogname: chararray, 
remoteAddr12: chararray, 
path2: chararray, 
sessionid: chararray, 
n15: chararray
);

filter_logs = FILTER raw_logs BY not (browser matches '.*pingdom.*');
--item_logs = FOREACH raw_logs GENERATE browser;

--percent 500 logs
reitem_percent_500_logs = FOREACH filter_logs GENERATE status,DATE_EXTRACT_MM(time) as month;
group_month_percent_500_logs = GROUP reitem_percent_500_logs BY (month);
final_month_500_logs = FOREACH group_month_percent_500_logs 
{
	total = COUNT(reitem_percent_500_logs);
	t = filter reitem_percent_500_logs by status== 500; --create a bag which contains only T values
	generate flatten(group) as col1, 100*(double)COUNT(t)/(double)total;
}
STORE final_month_500_logs into '$OUTPUT' using PigStorage(',');
